Introduction

This project is on Home Mortgage Disclosure Act Data(HMDA), which requires many financial institutions to maintain, report, and publicly disclose information about mortgages. This dataset covers mortgage decisions for first-lien, owner-occupied, 1-4 family house made in 2016 for the state of Florida. # Data wrangling and exploratory data analysis ## Download and Preprocess data The data of is downloaded from https://github.com/fengqifang/HMDA/raw/master/hmda_lar_FL_2016.zip. Descriptions for each column can be found in https://cfpb.github.io/api/hmda/fields.html.

library(data.table)
library(dplyr)
library(ggplot2)
library(kableExtra)
library(dygraphs)
library(knitr)
library(plotly)
library(stringr)
library(lubridate)
library(Matrix)
dat <- fread("hmda_lar_FL_2016.csv",na.strings = c("",NA))
## 
Read 57.8% of 415338 rows
Read 415338 rows and 36 (of 36) columns from 0.237 GB file in 00:00:03
head(dat,n =3)
##    tract_to_msamd_income rate_spread population minority_population
## 1:    109.08000183105469          NA       5310    23.6299991607666
## 2:    110.05999755859375          NA       2428  10.789999961853027
## 3:     148.5800018310547          NA       3943   85.47000122070312
##    number_of_owner_occupied_units number_of_1_to_4_family_units
## 1:                           1486                          2252
## 2:                            819                          1465
## 3:                           1041                           925
##    loan_amount_000s hud_median_family_income applicant_income_000s
## 1:              205                    59600                    NA
## 2:              156                    51700                    42
## 3:              143                    48100                    43
##    state_name state_abbr sequence_number respondent_id
## 1:    Florida         FL         0026556    0000617677
## 2:    Florida         FL         0287915    22-3039688
## 3:    Florida         FL         0079973    38-2750395
##                                                             purchaser_type_name
## 1: Loan was not originated or was not sold in calendar year covered by register
## 2:                                                            Ginnie Mae (GNMA)
## 3:                                                          Freddie Mac (FHLMC)
##                                               property_type_name
## 1: One-to-four family dwelling (other than manufactured housing)
## 2: One-to-four family dwelling (other than manufactured housing)
## 3: One-to-four family dwelling (other than manufactured housing)
##             preapproval_name                   owner_occupancy_name
## 1: Preapproval was requested Owner-occupied as a principal dwelling
## 2:            Not applicable Owner-occupied as a principal dwelling
## 3:            Not applicable Owner-occupied as a principal dwelling
##                                   msamd_name loan_type_name
## 1:         Pensacola, Ferry Pass, Brent - FL    FHA-insured
## 2: Deltona, Daytona Beach, Ormond Beach - FL    FHA-insured
## 3:          Miami, Miami Beach, Kendall - FL   Conventional
##    loan_purpose_name        lien_status_name hoepa_status_name
## 1:     Home purchase Secured by a first lien  Not a HOEPA loan
## 2:     Home purchase Secured by a first lien  Not a HOEPA loan
## 3:     Home purchase Secured by a first lien  Not a HOEPA loan
##             edit_status_name denial_reason_name_1       county_name
## 1: Quality edit failure only                   NA   Escambia County
## 2:                        NA                   NA    Volusia County
## 3:                        NA                   NA Miami-Dade County
##    co_applicant_sex_name co_applicant_race_name_1
## 1:                Female                    White
## 2:       No co-applicant          No co-applicant
## 3:       No co-applicant          No co-applicant
##    co_applicant_ethnicity_name census_tract_number as_of_year
## 1:      Not Hispanic or Latino             0036.14       2016
## 2:             No co-applicant             0805.00       2016
## 3:             No co-applicant             0168.00       2016
##    application_date_indicator applicant_sex_name applicant_race_name_1
## 1:                          0               Male                 White
## 2:                          0               Male                 White
## 3:                          0               Male                 White
##    applicant_ethnicity_name                                 agency_name
## 1:   Not Hispanic or Latino        Consumer Financial Protection Bureau
## 2:   Not Hispanic or Latino        Consumer Financial Protection Bureau
## 3:       Hispanic or Latino Department of Housing and Urban Development
##    agency_abbr
## 1:        CFPB
## 2:        CFPB
## 3:         HUD

There are 36 columns and 415338 observations in the raw data. let’s look at the data type and missing data rate for each column.

dat %>% sapply(., function(x)sum(is.na(x))/length(x)) %>%
         kable("html",col.names = "Missing.Rate") %>%
         kable_styling(bootstrap_options = c("striped", "hover"))
Missing.Rate
tract_to_msamd_income 0.0010714
rate_spread 0.9247890
population 0.0008957
minority_population 0.0008957
number_of_owner_occupied_units 0.0009462
number_of_1_to_4_family_units 0.0014157
loan_amount_000s 0.0000000
hud_median_family_income 0.0008788
applicant_income_000s 0.0754687
state_name 0.0000000
state_abbr 0.0000000
sequence_number 0.0000000
respondent_id 0.0000000
purchaser_type_name 0.0000000
property_type_name 0.0000000
preapproval_name 0.0000000
owner_occupancy_name 0.0000000
msamd_name 0.0142727
loan_type_name 0.0000000
loan_purpose_name 0.0000000
lien_status_name 0.0000000
hoepa_status_name 0.0000000
edit_status_name 0.8339184
denial_reason_name_1 1.0000000
county_name 0.0003900
co_applicant_sex_name 0.0000000
co_applicant_race_name_1 0.0000000
co_applicant_ethnicity_name 0.0000000
census_tract_number 0.0008788
as_of_year 0.0000000
application_date_indicator 0.0000000
applicant_sex_name 0.0000000
applicant_race_name_1 0.0000000
applicant_ethnicity_name 0.0000000
agency_name 0.0000000
agency_abbr 0.0000000

Data Cleanning and Exploratory Data Analysis

Let’s examining the varibles one by one. The first 9 columns are continous number, and it needs to be converted numeric first. And for other column, we can convert them into factor variables for now.

names(dat)
##  [1] "tract_to_msamd_income"          "rate_spread"                   
##  [3] "population"                     "minority_population"           
##  [5] "number_of_owner_occupied_units" "number_of_1_to_4_family_units" 
##  [7] "loan_amount_000s"               "hud_median_family_income"      
##  [9] "applicant_income_000s"          "state_name"                    
## [11] "state_abbr"                     "sequence_number"               
## [13] "respondent_id"                  "purchaser_type_name"           
## [15] "property_type_name"             "preapproval_name"              
## [17] "owner_occupancy_name"           "msamd_name"                    
## [19] "loan_type_name"                 "loan_purpose_name"             
## [21] "lien_status_name"               "hoepa_status_name"             
## [23] "edit_status_name"               "denial_reason_name_1"          
## [25] "county_name"                    "co_applicant_sex_name"         
## [27] "co_applicant_race_name_1"       "co_applicant_ethnicity_name"   
## [29] "census_tract_number"            "as_of_year"                    
## [31] "application_date_indicator"     "applicant_sex_name"            
## [33] "applicant_race_name_1"          "applicant_ethnicity_name"      
## [35] "agency_name"                    "agency_abbr"
dat <- dat %>% mutate_at(vars(1:9),funs(as.numeric(.)))
dat <- as.data.frame(unclass(dat))
str(dat)
## 'data.frame':    415338 obs. of  36 variables:
##  $ tract_to_msamd_income         : num  109 110 149 157 171 ...
##  $ rate_spread                   : num  NA NA NA NA NA NA 5.4 NA NA NA ...
##  $ population                    : num  5310 2428 3943 6252 22670 ...
##  $ minority_population           : num  23.6 10.8 85.5 93 41 ...
##  $ number_of_owner_occupied_units: num  1486 819 1041 1380 6133 ...
##  $ number_of_1_to_4_family_units : num  2252 1465 925 1611 7918 ...
##  $ loan_amount_000s              : num  205 156 143 340 258 311 207 66 193 95 ...
##  $ hud_median_family_income      : num  59600 51700 48100 48100 57800 59600 59200 48100 48100 57800 ...
##  $ applicant_income_000s         : num  NA 42 43 138 72 67 92 30 50 50 ...
##  $ state_name                    : Factor w/ 1 level "Florida": 1 1 1 1 1 1 1 1 1 1 ...
##  $ state_abbr                    : Factor w/ 1 level "FL": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sequence_number               : Factor w/ 160472 levels "0000001","0000002",..: 26324 119968 64722 72968 294 91 51297 26667 279 122902 ...
##  $ respondent_id                 : Factor w/ 1347 levels "0000000053","0000000058",..: 719 1001 1114 1243 1077 490 923 1136 1227 1001 ...
##  $ purchaser_type_name           : Factor w/ 10 levels "Affiliate institution",..: 8 6 5 3 6 7 10 3 2 3 ...
##  $ property_type_name            : Factor w/ 1 level "One-to-four family dwelling (other than manufactured housing)": 1 1 1 1 1 1 1 1 1 1 ...
##  $ preapproval_name              : Factor w/ 3 levels "Not applicable",..: 3 1 1 1 1 1 1 1 1 1 ...
##  $ owner_occupancy_name          : Factor w/ 1 level "Owner-occupied as a principal dwelling": 1 1 1 1 1 1 1 1 1 1 ...
##  $ msamd_name                    : Factor w/ 24 levels "Cape Coral, Fort Myers - FL",..: 16 3 9 9 13 16 22 9 9 13 ...
##  $ loan_type_name                : Factor w/ 4 levels "Conventional",..: 2 2 1 1 2 1 1 1 1 1 ...
##  $ loan_purpose_name             : Factor w/ 3 levels "Home improvement",..: 2 2 2 3 2 3 2 3 3 3 ...
##  $ lien_status_name              : Factor w/ 1 level "Secured by a first lien": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hoepa_status_name             : Factor w/ 2 levels "HOEPA loan","Not a HOEPA loan": 2 2 2 2 2 2 2 2 2 2 ...
##  $ edit_status_name              : Factor w/ 1 level "Quality edit failure only": 1 NA NA NA NA NA NA NA NA NA ...
##  $ denial_reason_name_1          : Factor w/ 0 levels: NA NA NA NA NA NA NA NA NA NA ...
##  $ county_name                   : Factor w/ 67 levels "Alachua County",..: 16 64 43 43 48 16 52 43 43 57 ...
##  $ co_applicant_sex_name         : Factor w/ 5 levels "Female","Information not provided by applicant in mail, Internet, or telephone application",..: 1 4 4 4 4 4 4 1 4 3 ...
##  $ co_applicant_race_name_1      : Factor w/ 8 levels "American Indian or Alaska Native",..: 8 6 6 6 6 6 6 8 6 8 ...
##  $ co_applicant_ethnicity_name   : Factor w/ 5 levels "Hispanic or Latino",..: 5 3 3 3 3 3 3 1 3 5 ...
##  $ census_tract_number           : Factor w/ 2962 levels "0001.00","0001.01",..: 369 2554 1460 1413 1489 133 1845 649 1530 1699 ...
##  $ as_of_year                    : Factor w/ 1 level "2016": 1 1 1 1 1 1 1 1 1 1 ...
##  $ application_date_indicator    : Factor w/ 1 level "0": 1 1 1 1 1 1 1 1 1 1 ...
##  $ applicant_sex_name            : Factor w/ 4 levels "Female","Information not provided by applicant in mail, Internet, or telephone application",..: 3 3 3 3 1 3 3 3 1 1 ...
##  $ applicant_race_name_1         : Factor w/ 7 levels "American Indian or Alaska Native",..: 7 7 7 7 3 4 7 2 7 7 ...
##  $ applicant_ethnicity_name      : Factor w/ 4 levels "Hispanic or Latino",..: 4 4 1 4 4 4 4 4 1 4 ...
##  $ agency_name                   : Factor w/ 6 levels "Consumer Financial Protection Bureau",..: 1 1 2 2 2 3 2 2 2 1 ...
##  $ agency_abbr                   : Factor w/ 6 levels "CFPB","FDIC",..: 1 1 4 4 4 2 4 4 4 1 ...

tract_to_msamd_income It is the percentage of the median family income for the tract compared to the median family income for the MSA/MD, rounded to two decimal places.

p <-ggplot(dat, aes(x = "", y = tract_to_msamd_income,fill="tract_to_msamd_income[%]")) + geom_boxplot()
p <- ggplotly(p)
## Warning: Removed 445 rows containing non-finite values (stat_boxplot).
p

There are 445 missing values and the values ranging from 11 to 500.

rate_spread Rate spread for the loan is the difference between the loan’s annual percentage rate (APR) and the average prime offer rate (APOR). This value is after the loan approval, which alos can not be used in the prediction. It can be dropped directly. s

dat <- data.table(dat)[, rate_spread:= NULL]

population and minority_population There are 372 missing values in both variables.

p <- ggplot(dat,aes(population,fill = "population"))+geom_histogram(fill = "red")
p1 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 372 rows containing non-finite values (stat_bin).
p <- ggplot(dat,aes(minority_population,fill = "minority_population[%]"))+
        geom_histogram(fill = "blue")
p2 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 372 rows containing non-finite values (stat_bin).
p <- subplot(p1,p2)
p

number_of_owner_occupied_units and number_of_1_to_4_family_units

p <- ggplot(dat,aes(number_of_owner_occupied_units,fill = "owener_occ"))+geom_histogram(fill = "red")
p1 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 393 rows containing non-finite values (stat_bin).
p <- ggplot(dat,aes(number_of_1_to_4_family_units,fill = "1 to 4 family units"))+
        geom_histogram(fill = "blue")
p2 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 588 rows containing non-finite values (stat_bin).
p <- subplot(p1,p2)
p

Impute the missing value with median.

dat <- dat %>% mutate_at(vars(number_of_owner_occupied_units,number_of_1_to_4_family_units),funs(ifelse(is.na(.),median(.),.))) 

loan_amount_000s,hud_median_family_income,applicant_income_000s

summary(dat$loan_amount_000s)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   138.0   194.0   226.9   270.0 20800.0
summary(dat$loan_amount_000s)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   138.0   194.0   226.9   270.0 20800.0

These two variables have very long tails. We will need to transform it first and then plot

p <- ggplot(dat, aes(x="Loan amount",y = log(loan_amount_000s),fill = "loan_amount_000s")) + geom_boxplot(fill ="red")
p1 <- ggplotly(p)

p <- ggplot(dat, aes(x ="median family income",y = log(hud_median_family_income/1000),fill = "median_family_income_000s")) + geom_boxplot(fill ="blue")

p2 <- ggplotly(p)
## Warning: Removed 365 rows containing non-finite values (stat_boxplot).
p <- ggplot(dat, aes(x = "applicant income",y=log(applicant_income_000s),fill = "applicant_income_000s")) + geom_boxplot(fill = "green")
p3 <- ggplotly(p)
## Warning: Removed 31345 rows containing non-finite values (stat_boxplot).
subplot(p1,p2,p3)

There are a lot of outlier in loan amount and applicant income. Those units are in thounsands, which means the highest loan is over 20M and the family income is 10M. It is interesting to see the ratio of loan amount and family income.

dt <- data.table(dat)[,.(applicant_income_000s,loan_amount_000s)][,ratio := ifelse((applicant_income_000s>0 & loan_amount_000s >0),loan_amount_000s/applicant_income_000s,NA)]
p <- ggplot(dt,aes(x="loan_over_income",y = ratio, fill = "loan_over_income"))+ geom_boxplot()
ggplotly(p)
## Warning: Removed 31345 rows containing non-finite values (stat_boxplot).
summary(dt$ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.007   1.909   2.628   2.767   3.459 383.000   31345

The maximum ratio of loan amount over income is 383, which seems more like a typo when computing the loan amount. There are 13 records with ratio over 100. This can be dropped from the data directly.

sum(dt$ratio >100,na.rm = TRUE)
## [1] 13
dat$ratio <- dt$ratio
dat <- data.table(dat)[ratio<100|is.na(ratio)]

I’m going to use KNN to impute the missing data in applicant income, so I will keep all the NA in ratio, which can be filled in when predicting the applicant income.

state_name and state_abbr The data set is from Florida and these two fields are same for all records.They can be dropped directly.

dat <- data.table(dat)[,c("state_name","state_abbr") := NULL]

sequence_number and respondent_id sequence_number is a unique number generated for each loan. respondent_id is a code representing the bank or other financial institution that is reporting the loan or application

dat <- data.table(dat)[,sequence_number := as.numeric(as.character(sequence_number))]
unique(dat$respondent_id)
##    [1] 0000617677 22-3039688 38-2750395 7197000003 31-1076317 0000057564
##    [7] 13-6131491 41-1795868 65-0763054 76-0503625 20-5239910 75-2921540
##   [13] 26-0813134 0003525577 7811300008 47-0873092 0000697633 0000817824
##   [19] 26-2053389 11-3647435 59-2472569 26-0595342 0000233031 0000480228
##   [25] 52-2276553 7927200007 0000852218 0000024702 0000852320 26-3943038
##   [31] 86-0860478 0002888798 37-1542226 0000451965 0000342634 46-1342618
##   [37] 56-2237729 0000619877 38-2749215 46-1868641 75-3170028 7884500006
##   [43] 59-3225526 1252000009 31-1197926 20-3702275 2149009991 1517600004
##   [49] 54-0259290 26-4599244 0000146672 0002735146 27-4023565 0000476810
##   [55] 41-1726353 0000016595 75-2269700 39-2001010 20-8921389 7505400005
##   [61] 43-1965151 1227300009 0000395238 20-8083209 52-2102740 26-0698083
##   [67] 0000062417 33-0750812 46-2981456 36-4327855 0000035014 77-0605392
##   [73] 42-1739728 1705700001 0000915878 23-2769131 33-0975529 0000068476
##   [79] 0000061160 0000542528 0000006774 35-2195549 26-1773722 27-0684906
##   [85] 54-1822116 0000007783 33-0941669 54-2096352 0000000196 26-0777326
##   [91] 0000001303 0000059049 7983500003 65-0782023 0542409990 0000000872
##   [97] 57-1175755 7516800003 26-0362771 0000005727 34-1719615 3827009995
##  [103] 76-0236067 0000021122 0000068600 23-2470039 04-3212636 22-3603829
##  [109] 0000653648 52-2323186 72-1441479 0000497404 95-4267987 26-2049351
##  [115] 26-1117897 0000463735 0000024947 2317700005 22-3887207 65-0447670
##  [121] 0000011443 83-0368862 0000028316 0000702825 0000067297 26-2261031
##  [127] 54-2070914 1461700004 71-0456844 7257500009 52-2337558 95-4623407
##  [133] 0000001976 0000023416 26-0021318 0000024224 3027509990 30-0583815
##  [139] 0000000058 20-2053401 20-8745846 95-4762204 0000876634 0000546571
##  [145] 6443809990 0000339858 0003536407 0000019899 95-4866828 20-8544905
##  [151] 0000023922 0000456045 0000013303 26-3416474 41-2259092 0000057993
##  [157] 22-3554558 22-3470404 11-3412303 0000004239 7810600004 0000009788
##  [163] 0000066357 52-2091594 0002193616 68-0151632 0000703927 58-2442998
##  [169] 0000716195 75-2695327 59-3345547 0000060823 0000504713 0000068645
##  [175] 0000024080 65-0924915 34-1194858 26-1717830 0000018149 0000034607
##  [181] 37-1493496 20-8006279 46-3689142 54-1994393 7635500004 65-0855140
##  [187] 3833009998 0000057949 0000706707 0000067389 26-2593704 22-3626334
##  [193] 26-3295749 0002590037 0000023521 2126300006 0000023627 01-0726495
##  [199] 48-1236121 0000008663 38-3799175 84-1564935 26-0423240 01-0759540
##  [205] 0000024497 47-0912342 38-3564305 7756300009 20-4224234 58-2059073
##  [211] 26-3780954 0000765019 0000012437 87-0630553 26-2597540 26-1589507
##  [217] 7527300003 3000809992 0000005561 42-1554181 62-1532940 20-1832276
##  [223] 38-3434023 0003591282 0000934329 27-0349391 0000032974 43-0951349
##  [229] 62-1259997 54-2024584 0000014838 20-8978372 38-3616059 16-1686740
##  [235] 38-2434249 7687600009 0000058452 39-1767726 0000024671 0000706081
##  [241] 59-3256287 0000008120 0000000435 0000011734 26-1796172 46-3676810
##  [247] 0000011780 46-1728831 0000694904 46-3435079 0000060497 0000015219
##  [253] 46-0530020 27-2389039 58-2642920 0000010754 0000025080 02-0687335
##  [259] 0000010699 0002489805 20-4514405 34-1716542 90-0790926 37-1572886
##  [265] 0000510871 36-4485107 0000706809 0000022311 0003076248 0000008021
##  [271] 0000060816 0000705798 20-4866754 7272800006 68-0309242 0000723112
##  [277] 0000024719 0000067344 20-1255434 43-1918759 0000592448 0000112837
##  [283] 0000067521 0000707635 1635900004 39-1813773 0000208244 0267100004
##  [289] 20-0640473 0000016450 0000708186 99-0026713 0000656733 0000177957
##  [295] 41-1842999 0000061170 0000012311 0000062514 36-3987946 75-2674544
##  [301] 0000501105 20-5973457 0000008442 27-1190043 0000060087 20-5604550
##  [307] 57-0761331 0471809999 20-0192872 41-1914032 48-1148159 33-0962918
##  [313] 22-3747694 0000808176 59-2943096 0000062728 22-2812732 0000014740
##  [319] 0003532641 1910500003 0000067725 0000028151 0000014851 61-1306242
##  [325] 20-2471369 1768800000 63-1113916 0000068095 90-0724148 0000033440
##  [331] 0000068663 0003467828 0000229801 74-3038404 75-2725402 63-1246790
##  [337] 1125900008 01-0464609 0000064552 0000032296 47-3963992 76-0762186
##  [343] 0000716456 7442300004 39-1856165 0000023216 58-2200634 0000001213
##  [349] 22-2437469 0000068490 0000021440 05-0402708 0000020504 0000057131
##  [355] 1099800006 65-0882429 20-2928975 01-0681100 86-0431588 7280100005
##  [361] 20-2355296 80-0233937 13-4362989 7523800002 68-0295876 65-1038507
##  [367] 0000021284 27-3863726 0000058309 1522600000 0000008879 80-0860209
##  [373] 51-0517525 1100900005 0000004968 0000068301 38-2799035 20-8549934
##  [379] 95-3990375 0000057870 22-2894356 0000062715 52-2304304 1097500000
##  [385] 04-3694338 13-4127143 62-1494087 0003212149 26-3264687 22-3157857
##  [391] 0000057598 0004114567 20-2485875 41-1592297 0002182786 72-1545376
##  [397] 0002736291 22-3753355 0000000857 0000058694 20-1436988 20-2752826
##  [403] 0000068417 20-4685477 16-1313078 26-0335190 02-0594137 20-5238443
##  [409] 0003690662 27-0202259 0000718037 26-0360466 20-5741925 11-3073502
##  [415] 7976700006 0000712202 54-1870627 33-0231744 20-3450673 20-2409656
##  [421] 0000067668 27-0222046 0000708007 52-1845409 31-1690008 0000034233
##  [427] 33-0886403 0000000352 88-0209429 26-0707492 46-3537400 36-4115595
##  [433] 73-1577221 20-3708208 46-2886385 0000061525 0000715850 0000005582
##  [439] 20-4136310 75-2838184 0000022444 27-0022323 0000717955 0000210434
##  [445] 20-4255880 7275700004 0000024649 0000010146 27-1438405 0000712504
##  [451] 1519700008 20-2470783 11-3654971 26-2429669 0003150447 13-3753941
##  [457] 0003938186 0000058377 7776400006 0000009976 75-2585326 0000024833
##  [463] 0000005676 1173900006 0000878898 76-0578435 0000063262 74-2508160
##  [469] 02-0793125 27-4588183 0000057614 0004455073 0003395293 11-3198028
##  [475] 20-1792214 62-1859487 34-1633105 06-1484285 1821100000 26-1242154
##  [481] 95-4196389 0000021265 0001093317 0000067347 0000717974 0000707506
##  [487] 0000068187 0000646453 45-2965086 0000000176 0000413208 0000004043
##  [493] 0000016804 0000491224 0000058483 0000068598 0000031286 0000015977
##  [499] 20-5614029 20-8279218 0000015605 0000068445 0000024223 26-4030355
##  [505] 0000067955 0000018203 0000023850 0000034953 0003351440 20-8802502
##  [511] 0000015974 22-3665409 0000023773 0000023957 2003500009 0000016401
##  [517] 22-3626426 52-2321476 45-3143795 0000021468 0000068391 0000028513
##  [523] 0000063799 0003415463 0000003738 0000002182 0000020214 56-2103469
##  [529] 0000035599 0000006288 0000001027 27-3459350 0000016325 0000024823
##  [535] 30-0142226 0000057934 7230000003 0000057973 7056000000 95-4830207
##  [541] 0000011836 47-4252448 84-1412422 84-1594306 77-0717225 32-0016270
##  [547] 68-0539887 0000006665 84-1040263 59-3378746 36-4078960 20-0142846
##  [553] 0000067330 7851800005 76-0561995 0000057369 0000024326 0003844700
##  [559] 35-1616238 0000063069 0000019772 36-4176531 0000004735 0000068042
##  [565] 32-0293417 72-1505298 0000027205 0000023880 38-2699549 84-1496821
##  [571] 54-1094297 73-1374559 38-3305907 38-3365241 3840409990 0000606046
##  [577] 80-0743699 04-2802284 04-4354554 95-4523866 41-2031753 0000021717
##  [583] 68-0570826 0000003656 84-0927358 99-0059010 0000018708 0000023020
##  [589] 0000005673 0000068664 41-2181683 41-2277737 0000045826 0002771694
##  [595] 0000024703 0000058479 0000062882 0000057997 0000017587 0000017642
##  [601] 26-1234319 0000058376 0000022458 0000057425 0000057776 0000006053
##  [607] 27-1970388 0000011423 0000068287 0000015465 0000034363 54-1872422
##  [613] 63-1052225 0000011804 0000715018 59-3591757 0000011789 0000021292
##  [619] 0000035376 0000014619 33-0828099 0000830542 0000708039 31-1060110
##  [625] 14-1886719 0000058520 0000057874 54-1822464 0000027086 36-4226027
##  [631] 0000018600 85-0260899 0000030810 3418909997 14-1841762 0000005598
##  [637] 0000068571 0000022257 38-3358941 0003452105 0003303298 0000714970
##  [643] 01-0639308 01-0737466 1411700003 43-0889809 0003374412 0000017393
##  [649] 0000006248 31-1488064 26-3709040 0000280110 0000068465 0000067342
##  [655] 0002399119 0000000151 7079100007 1611300007 0000032737 11-3290207
##  [661] 0000004142 0000009366 0000035570 20-2693054 04-3554142 0000057071
##  [667] 0000058537 74-3061159 0000010023 58-2419109 0000080122 0000068539
##  [673] 0000010118 26-0084919 77-0672274 0000018763 04-7534967 91-1780488
##  [679] 86-0415227 0000067290 95-4482547 0000058380 0000061605 0000005754
##  [685] 0000068601 83-0345906 84-0849361 0000008796 0000024496 0001216826
##  [691] 7667200009 0000002285 0000057777 0000000803 0000001644 0000005500
##  [697] 0000035212 0000009260 0000014816 0000021041 99-0058081 0000024292
##  [703] 0000006328 46-3525878 0000023049 0000024557 0000026467 0000018297
##  [709] 0000019269 0000019628 0000020061 0000004596 0000001467 0003545324
##  [715] 0000067664 0000075633 0000001115 03-0488052 0000000340 0000311845
##  [721] 0000614313 20-0304793 0000016442 11-2961751 0000024748 0000506249
##  [727] 58-2307082 0000058762 0000708146 7053700008 0000034121 0000068524
##  [733] 0002652092 0000005716 0001842065 0000068607 20-3828708 0000058672
##  [739] 0003370133 0000035095 47-3632618 0000067332 0000026725 0000004745
##  [745] 0000057607 0000001390 0000067329 0000008141 0000001199 41-1941324
##  [751] 0000015748 0000005854 0000701437 0000065063 0000017914 84-1347792
##  [757] 1461400001 42-1720343 0000000077 0000000110 0000006627 0000007592
##  [763] 0000022883 0000034977 0000020250 0000058152 0000928618 0000060143
##  [769] 0000015496 0000020369 0000024077 0000024849 0000713570 0000749635
##  [775] 0000068215 0000773247 0003807826 0002845755 27-3049738 62-0997810
##  [781] 0000058243 0000019976 27-4541183 0000008333 0000009632 0000011444
##  [787] 87-0682600 0000022949 0000029782 13-4225190 0000066157 0000057382
##  [793] 0000008116 0000058262 0000009155 0000012458 11-2637419 0000058937
##  [799] 0000000967 0000009963 0000012773 0000022769 0000024131 0000024693
##  [805] 0000026647 0000067311 0000196800 0000018124 0000706432 0000700709
##  [811] 0000705902 04-3568208 0000067444 0000005907 04-2992161 0000761806
##  [817] 0000034153 16-1222429 36-3575871 0000032718 39-2051009 0000023709
##  [823] 0000019581 0000023523 0000014252 52-2075704 0000005588 0003120646
##  [829] 0000020600 0000023621 0000029690 0000068615 7638200000 0000035583
##  [835] 0000034348 0000005685 0000014488 0000066158 0000033527 0000062155
##  [841] 0000165628 0000169653 0000019506 0000028114 0000703920 0000704255
##  [847] 0000590976 20-5439763 0000853952 0003816510 02-0486480 52-1720545
##  [853] 0000015358 0003342671 26-4461592 0000035295 75-2945170 0000009620
##  [859] 0000058874 0000539340 0000018200 0000003657 0000010254 0000011459
##  [865] 0000090196 0000000622 0000517900 0000067190 0000703870 31-1484057
##  [871] 0000013534 0000008797 22-2999585 99-0027234 0000068448 0000017580
##  [877] 0000058206 0000010307 0000011286 0000004395 0000022157 0000005033
##  [883] 0000008941 0000010853 0000034092 0000065597 0000853112 0003248849
##  [889] 0000020068 0000023834 0000708043 0000007461 0000000619 0000000861
##  [895] 0000009007 0000009095 0000707938 0000010634 0000010880 0000001253
##  [901] 0000003025 0000012219 0000005528 0000016410 0000017017 0000007840
##  [907] 0000017374 0000018569 0000020448 0000009607 0000005694 0000021573
##  [913] 0000022592 0000542649 0000011112 0000011407 0000012383 0000025075
##  [919] 0000029209 0000014371 0000018566 0000018641 0000055130 0000022637
##  [925] 0000023852 0000017880 0000060238 0000061093 0000068536 0000057542
##  [931] 81-0611149 0000058127 0000024114 11-3469077 0000061090 0000016396
##  [937] 0000005774 0000067300 0000067874 0000139843 0000027552 7416700009
##  [943] 0000014369 0000058703 0000005766 0000717935 0000417626 0000014310
##  [949] 0000068574 0000034683 0000016748 0000700257 0000023329 20-1193192
##  [955] 0000000162 0000067132 0000001747 0000003121 0000003654 0000007238
##  [961] 0000008680 0000009123 0000011739 0000014361 0000016249 0000019200
##  [967] 0000020042 0000024107 0000024438 0000025103 0000060239 0000068375
##  [973] 0000057033 0000058359 0000058556 0000058648 0000034980 0000013955
##  [979] 22-3547567 0000066835 0000068344 16-1237621 0000601050 0002895152
##  [985] 0000060978 0000663245 0000667757 0000703536 0000703862 0000704192
##  [991] 0000713480 0000701158 51-0370645 0000717528 0000717936 0000763556
##  [997] 26-4803398 0000058725 0000058372 0000722777 20-3492069 0001007846
## [1003] 0002939001 11-3199045 16-1566654 0003437166 0000012955 20-0353048
## [1009] 0000007875 0000636771 0003918898 0000002805 02-0640271 0000002311
## [1015] 23-3102701 0000014843 0000001999 0000035406 3028209994 27-3627801
## [1021] 30-0376657 3833709994 0000034899 0000027237 61-1355156 0000023101
## [1027] 3804009998 65-0738229 11-3468642 05-0447801 0000003830 22-3048514
## [1033] 7229000002 0000058088 0000713964 74-3050221 0000207872 38-3024363
## [1039] 99-0703835 0000003576 0000016940 87-0623581 0000005599 0000035333
## [1045] 13-3670961 0000017713 0000014078 0000004857 0000003633 0000005495
## [1051] 0000005969 0000020976 0000002601 0000018930 0000006207 0000007217
## [1057] 0000007925 0000914648 0000014176 0000064084 0000019690 0000018800
## [1063] 0000700358 0000060484 0000058604 0000014723 0000001808 0000013790
## [1069] 0000223322 0000021704 0000236706 0000398350 0000272272 0000034620
## [1075] 0000058506 0000657459 0000068302 0000917742 0000189129 03-0527286
## [1081] 7107600005 0000007397 20-2408098 22-3682843 0000057249 0000002221
## [1087] 0000009314 0000011124 0000015292 0000024369 0000027280 0000034524
## [1093] 0000058257 0000058348 0000058608 20-8872754 0000019953 0000011236
## [1099] 0000016606 0000023827 0000001039 0000002360 0000011406 0000029786
## [1105] 0000014221 0000057841 0000023264 0000024297 0000027744 0000010958
## [1111] 0000700416 0000718058 0000150727 0000013791 04-2876549 0000379920
## [1117] 0000702631 72-0593959 0000744238 0000712423 7902600002 0000002747
## [1123] 0000001071 0000005380 0000009712 0000010180 0000013345 0000014062
## [1129] 2148900000 0000023570 0000015732 0000015032 42-1586715 0000016185
## [1135] 0000067403 0000068583 0000014991 0000024755 0000703019 34-1815339
## [1141] 7992700007 0000019307 7621100007 0000577128 0000613307 0000020179
## [1147] 0000846619 0000991340 0000057815 75-3197409 0000024685 0000016891
## [1153] 0000058302 0000058719 0000058892 0000018214 0000035029 0000700670
## [1159] 0000024279 0000014251 0000703879 0000485559 0000504311 76-0454396
## [1165] 0000014158 0000016889 0000064049 0000000233 0000068673 0000024349
## [1171] 0000002828 0000001926 80-0860660 0000066538 0000064920 0000009915
## [1177] 87-0788512 0000009170 0000034728 7310100001 0000032647 0000024169
## [1183] 0000024516 0000020157 0000020468 0000021220 0000058095 0000705852
## [1189] 0000012854 0000009923 0000021525 0000024430 0000031445 0000058000
## [1195] 0000061623 0000058176 0000701409 0000057358 0000712609 26-1146120
## [1201] 0000024566 0000002149 0000024811 0000058599 0000015302 0000342410
## [1207] 75-2511480 0000057091 0000011868 0000813947 11-3217224 0000024235
## [1213] 0000026912 0000034047 0000024425 0000013102 0000010646 0000012181
## [1219] 0000018288 0000019510 0000021751 0000024479 0000024573 0000026333
## [1225] 0000045551 0000063004 0000067316 0000702518 0000022488 0000702889
## [1231] 0000702818 0000708190 0000833749 0001010930 0000058387 0000024891
## [1237] 0000010223 03-0486044 0003303917 45-5349650 0000068654 0000590248
## [1243] 47-2565675 0000015085 0000025106 0000707101 51-0415301 0000058657
## [1249] 0000013944 99-0019753 0000016221 0000062897 0000090270 35-1985128
## [1255] 0000018283 11-3019327 20-8190898 0000023966 0000024136 0000024508
## [1261] 0000034643 0000067348 0000025093 0000000953 0000007346 0000024156
## [1267] 0000029878 0000057214 0000000053 0000063057 0000020666 0000004365
## [1273] 75-3103689 0000062537 0000009737 0000015289 0000023290 0000034325
## [1279] 0000057377 0000002812 0000068681 0000090140 0000186717 95-4671903
## [1285] 0000014207 0000017690 0000009486 0000058464 0000008128 0000028068
## [1291] 0000090203 0000016500 0000000159 0000008056 0000058548 0000353238
## [1297] 0000001617 0000019678 0000024034 0000024360 0000068485 0000058374
## [1303] 0000068665 0000096761 0000024523 0000716833 0003635533 0000751656
## [1309] 0000936855 0003850206 0000090215 0000024312 0000006058 0000716697
## [1315] 0000068354 0000016980 99-0090179 20-5728606 0000024359 0000028211
## [1321] 0000033758 0000004675 0000058615 0000060683 0000021630 0000030090
## [1327] 0000126553 0000000902 0000008136 0000008854 0000150035 0000057754
## [1333] 0000068423 0000354310 0000006910 0000009179 0000009408 0000348159
## [1339] 0000000172 0000017614 0000034533 0000022286 0000024802 0000033794
## [1345] 04-3508773 0002353595 0000013876
## 1347 Levels: 0000000053 0000000058 0000000077 0000000110 ... 99-0703835

There are 1347 different respondent id, the format are different, we change the “-” to 0. I want to look at the first 20 most respondent id.

dat <- data.table(dat)[,respondent_id_new := gsub("-","0",as.character(respondent_id))]
dt <- data.table(dat)[,.(respondent_id_new)][, ct := .N, by = respondent_id_new]
dt <- unique(dt)
dt <- dt[order(-ct)]
dt[1:20,]
##     respondent_id_new    ct
##  1:        7197000003 31583
##  2:        0000451965 19919
##  3:        2203039688 14111
##  4:        1306131491 12886
##  5:        0000146672  9246
##  6:        0000480228  9227
##  7:        5400259290  8217
##  8:        2604599244  8075
##  9:        2003702275  7572
## 10:        0000852218  7263
## 11:        0000233031  6538
## 12:        2600813134  6016
## 13:        3802750395  5861
## 14:        3300975529  5785
## 15:        0000617677  5333
## 16:        0000619877  5222
## 17:        5903345547  5068
## 18:        2149009991  4886
## 19:        2600595342  4672
## 20:        0002735146  4574

purchaser_type_name

unique(dat$purchaser_type_name)
##  [1] Loan was not originated or was not sold in calendar year covered by register
##  [2] Ginnie Mae (GNMA)                                                           
##  [3] Freddie Mac (FHLMC)                                                         
##  [4] Fannie Mae (FNMA)                                                           
##  [5] Life insurance company, credit union, mortgage bank, or finance company     
##  [6] Private securitization                                                      
##  [7] Commercial bank, savings bank or savings association                        
##  [8] Other type of purchaser                                                     
##  [9] Affiliate institution                                                       
## [10] Farmer Mac (FAMC)                                                           
## 10 Levels: Affiliate institution ...
dt <- data.table(dat)[,ct := .N,by = purchaser_type_name][,.(purchaser_type_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = purchaser_type_name,y = ct))+ geom_histogram(stat = "identity",fill = "blue") +  scale_x_discrete(labels = function(x) str_wrap(x, width = 20))+ theme(axis.text.x=element_text(angle=45,hjust=1,vjust=0.5,size = 5))
ggplotly(p)

property_type_name

unique(dat$property_type_name)
## [1] One-to-four family dwelling (other than manufactured housing)
## Levels: One-to-four family dwelling (other than manufactured housing)
sum(is.na(dat$property_type_name))
## [1] 0

There is only one value in the property_type_name and no missing value. This column can be dropped directly.

dat <- data.table(dat)[,property_type_name := NULL]

preapproval_name

unique(dat$preapproval_name)
## [1] Preapproval was requested     Not applicable               
## [3] Preapproval was not requested
## 3 Levels: Not applicable ... Preapproval was requested

Let’s look at the preapproval distribution.

dt <- data.table(dat)[,ct := .N,by = preapproval_name][,.(preapproval_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = preapproval_name,y = ct))+ geom_histogram(stat = "identity",fill = "blue") 
ggplotly(p)

owner_occupancy_name

unique(dat$owner_occupancy_name)
## [1] Owner-occupied as a principal dwelling
## Levels: Owner-occupied as a principal dwelling
sum(is.na(dat$owner_occupancy_name))
## [1] 0
dat <- data.table(dat)[,owner_occupancy_name := NULL]

msamd_name

dt <- data.table(dat)[,ct := .N,by = msamd_name][,.(msamd_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = msamd_name,y = ct))+ geom_histogram(stat = "identity",fill = "blue") +  scale_x_discrete(labels = function(x) str_wrap(x, width = 20))+ theme(axis.text.x=element_text(angle=45,hjust=1,vjust=0.5))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(p)

loan_type_name

dt <- data.table(dat)[,ct := .N,by = loan_type_name][,.(loan_type_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = loan_type_name,y = ct))+ geom_histogram(stat = "identity",fill = "blue") 
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(p)

loan_purpose_name

dt <- data.table(dat)[,ct := .N,by = loan_purpose_name][,.(loan_purpose_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = loan_purpose_name,y = ct))+ geom_histogram(stat = "identity",fill = "blue")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(p)

lien_status_name

dat <-data.table(dat)[,lien_status_name:= NULL]

hoepa_status_name

dt <- data.table(dat)[,ct := .N,by = hoepa_status_name][,.(hoepa_status_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = hoepa_status_name,y = ct))+ geom_histogram(stat = "identity",fill = "blue")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(p)
dat <- data.table(dat)[,hoepa_status := ifelse(as.character(hoepa_status_name) == "HOEPA loan",1,0)]

county_name

unique(dat$county_name)
##  [1] Escambia County     Volusia County      Miami-Dade County  
##  [4] Orange County       Pinellas County     Seminole County    
##  [7] Bay County          Santa Rosa County   Collier County     
## [10] Osceola County      Brevard County      Pasco County       
## [13] Lake County         Palm Beach County   Hillsborough County
## [16] Indian River County Broward County      St. Johns County   
## [19] DeSoto County       Lee County          Bradford County    
## [22] Monroe County       St. Lucie County    Highlands County   
## [25] Duval County        Levy County         Charlotte County   
## [28] Martin County       Gulf County         Walton County      
## [31] Flagler County      Leon County         Madison County     
## [34] Sumter County       Columbia County     Wakulla County     
## [37] Hernando County     Okaloosa County     Jackson County     
## [40] Okeechobee County   Hardee County       Clay County        
## [43] Calhoun County      Washington County   Nassau County      
## [46] <NA>                Suwannee County     Putnam County      
## [49] Hendry County       Taylor County       Union County       
## [52] Gadsden County      Dixie County        Franklin County    
## [55] Polk County         Jefferson County    Glades County      
## [58] Baker County        Liberty County      Lafayette County   
## [61] Hamilton County     Holmes County       Alachua County     
## [64] Manatee County      Citrus County       Sarasota County    
## [67] Gilchrist County    Marion County      
## 67 Levels: Alachua County Baker County Bay County ... Washington County
sum(is.na(dat$county_name))
## [1] 162
dim(dat[is.na(county_name)&!is.na(msamd_name)])
## [1]  0 33

There are 162 missing values in county. And those are also missing msamd information.

dt <- data.table(dat)[,ct := .N,by = county_name][,.(county_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x=reorder(county_name, ct), y=ct)) +
  geom_bar(stat="identity",fill = "blue") +
  theme(axis.text.y=element_text(size = 5))+
  coord_flip()
ggplotly(p)

edit_status_name

sum(is.na(dat$edit_status_name))
## [1] 346358
dat <- data.table(dat)[,edit_status_new := ifelse(is.na(edit_status_name),0,1)]

denial_reason_name_1

dat <- data.table(dat)[,denial_reason_name_1 := NULL]

applicant_sex_name and co_applicant_sex_name

unique(dat$applicant_sex_name)
## [1] Male                                                                             
## [2] Female                                                                           
## [3] Information not provided by applicant in mail, Internet, or telephone application
## [4] Not applicable                                                                   
## 4 Levels: Female ...
unique(dat$co_applicant_sex_name)
## [1] Female                                                                           
## [2] No co-applicant                                                                  
## [3] Male                                                                             
## [4] Information not provided by applicant in mail, Internet, or telephone application
## [5] Not applicable                                                                   
## 5 Levels: Female ...
dt <- data.table(dat)[,ct := .N,by = applicant_sex_name][,.(applicant_sex_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = applicant_sex_name,y = ct,fill = "applicant"))+ geom_histogram(stat = "identity",fill = "red")  
## Warning: Ignoring unknown parameters: binwidth, bins, pad
p1 <- ggplotly(p)

dt <- data.table(dat)[,ct := .N,by = co_applicant_sex_name][,.(co_applicant_sex_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = co_applicant_sex_name,y = ct,fill = "co_applicant"))+ geom_histogram(stat = "identity",fill = "blue")  
## Warning: Ignoring unknown parameters: binwidth, bins, pad
p2 <- ggplotly(p)
subplot(p1,p2)

applicant_race_name_1 and co_applicant_race_name_1

unique(dat$applicant_race_name_1)
## [1] White                                                                            
## [2] Black or African American                                                        
## [3] Information not provided by applicant in mail, Internet, or telephone application
## [4] Asian                                                                            
## [5] Native Hawaiian or Other Pacific Islander                                        
## [6] American Indian or Alaska Native                                                 
## [7] Not applicable                                                                   
## 7 Levels: American Indian or Alaska Native ... White
unique(dat$co_applicant_race_name_1)
## [1] White                                                                            
## [2] No co-applicant                                                                  
## [3] American Indian or Alaska Native                                                 
## [4] Information not provided by applicant in mail, Internet, or telephone application
## [5] Black or African American                                                        
## [6] Asian                                                                            
## [7] Native Hawaiian or Other Pacific Islander                                        
## [8] Not applicable                                                                   
## 8 Levels: American Indian or Alaska Native ... White
dt <- data.table(dat)[,ct := .N,by = applicant_race_name_1][,.(applicant_race_name_1,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = applicant_race_name_1,y = ct,fill = "applicant"))+ geom_histogram(stat = "identity",fill = "blue")+  scale_x_discrete(labels = function(x) str_wrap(x, width = 20))+ theme(axis.text.x=element_text(angle=45,hjust=1,vjust=0.5,size=5))  
## Warning: Ignoring unknown parameters: binwidth, bins, pad
p1 <- ggplotly(p)

dt <- data.table(dat)[,ct := .N,by = co_applicant_race_name_1][,.(co_applicant_race_name_1,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = co_applicant_race_name_1,y = ct,fill = "co_applicant"))+ geom_histogram(stat = "identity",fill = "red")+  scale_x_discrete(labels = function(x) str_wrap(x, width = 20))+ theme(axis.text.x=element_text(angle=45,hjust=1,vjust=0.5,size=5))  
## Warning: Ignoring unknown parameters: binwidth, bins, pad
p2 <- ggplotly(p)
subplot(p1,p2)

census_tract_number

dt <- data.table(dat)[,ct := .N,by = census_tract_number][,.(census_tract_number,ct)]
dt <- unique(dt)
summary(dt$ct)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    50.0    98.0   140.2   175.0  1657.0
dt <- dt[order(-ct)][1:20]
p <- ggplot(dt, aes(x=reorder(census_tract_number, ct), y=ct)) +
  geom_bar(stat="identity",fill = "blue")+
  coord_flip()
ggplotly(p)

applicant_ethnicity_name and co_applicant_ethnicity_name

dt <- data.table(dat)[,ct := .N,by = applicant_ethnicity_name][,.(applicant_ethnicity_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = applicant_ethnicity_name,y = ct,fill = "applicant"))+ geom_histogram(stat = "identity",fill = "blue")+  scale_x_discrete(labels = function(x) str_wrap(x, width = 20))+ theme(axis.text.x=element_text(angle=45,hjust=1,vjust=0.5,size=5))  
## Warning: Ignoring unknown parameters: binwidth, bins, pad
p1 <- ggplotly(p)

dt <- data.table(dat)[,ct := .N,by = co_applicant_ethnicity_name][,.(co_applicant_ethnicity_name,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = co_applicant_ethnicity_name,y = ct,fill = "co_applicant"))+ geom_histogram(stat = "identity",fill = "red")+  scale_x_discrete(labels = function(x) str_wrap(x, width = 20))+ theme(axis.text.x=element_text(angle=45,hjust=1,vjust=0.5,size=5))  
## Warning: Ignoring unknown parameters: binwidth, bins, pad
p2 <- ggplotly(p)
subplot(p1,p2)

agency_name and agency_abbr We only need one of them in the data.

dat <- data.table(dat)[,agency_name := NULL]
dt <- data.table(dat)[,ct := .N,by = agency_abbr][,.(agency_abbr,ct)]
dt <- unique(dt)
p <- ggplot(dt, aes(x = agency_abbr,y = ct))+ geom_histogram(stat = "identity",fill = "blue")  
ggplotly(p)